# Load the survey responses; text columns are read in as factors.
mh <- read.csv(
  file = "~/Desktop/survey2.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Show the structure of the loaded data frame (columns, types, first values).
str(mh)
## 'data.frame': 1433 obs. of 63 variables:
## $ self_employed : int 0 0 0 1 0 0 0 0 0 1 ...
## $ num_employees : Factor w/ 7 levels "0","1 to 5","100-500",..: 4 6 6 1 6 7 4 7 4 1 ...
## $ tech_company : Factor w/ 3 levels "0","1","n/a": 2 2 2 3 1 2 2 2 1 3 ...
## $ primary_role : Factor w/ 3 levels "0","1","n/a": 3 3 3 3 2 3 3 3 2 3 ...
## $ mental_health_coverage : Factor w/ 3 levels "No","Yes","n/a": 3 1 1 3 2 2 3 2 3 3 ...
## $ mental_health_options : Factor w/ 4 levels "N/A","No","Yes",..: 4 3 1 4 3 4 2 3 2 4 ...
## $ mental_health_formally_discussed : Factor w/ 3 levels "No","Yes","n/a": 1 2 1 3 1 1 1 1 1 3 ...
## $ mental_health_resources : Factor w/ 3 levels "No","Yes","n/a": 1 2 1 3 1 2 1 2 1 3 ...
## $ anonymity_protected : Factor w/ 3 levels "No","Yes","n/a": 3 2 3 3 1 2 3 2 3 3 ...
## $ medical_leave : Factor w/ 6 levels "Neither easy nor difficult",..: 5 3 1 6 1 3 3 5 4 6 ...
## $ mental_health_negative : Factor w/ 3 levels "No","Yes","n/a": 1 1 3 3 2 2 1 1 2 3 ...
## $ physical_health_negative : Factor w/ 3 levels "No","Yes","n/a": 1 1 1 3 3 2 1 1 2 3 ...
## $ mental_health_comfort_coworker : Factor w/ 3 levels "No","Yes","n/a": 3 3 3 3 3 3 3 3 2 3 ...
## $ mental_health_comfort_supervisor : Factor w/ 3 levels "No","Yes","n/a": 2 2 3 3 1 2 2 2 3 3 ...
## $ mental_health_taken_seriously : Factor w/ 3 levels "No","Yes","n/a": 3 2 3 3 1 1 2 3 1 3 ...
## $ coworker_negative_consequences : Factor w/ 3 levels "No","Yes","n/a": 1 1 1 3 1 2 1 1 1 3 ...
## $ private_med_coverage : Factor w/ 3 levels "0","1","n/a": 3 3 3 2 3 3 3 3 3 2 ...
## $ resources_awareness : Factor w/ 4 levels "I know some",..: 4 4 4 3 4 4 4 4 4 1 ...
## $ reveal_diagnosis_clients_or_business : Factor w/ 6 levels "No, because it doesn't matter",..: 6 6 6 4 6 6 6 6 6 1 ...
## $ revealed_negative_consequences_CB : Factor w/ 4 levels "N/A","No","Yes",..: 4 4 4 4 4 4 4 4 4 1 ...
## $ reveal_diagnosis_coworkers : Factor w/ 5 levels "No, because it doesn't matter",..: 5 5 5 3 5 5 5 5 5 3 ...
## $ revealed_negative_consequences_CW : Factor w/ 3 levels "No","Yes","n/a": 3 3 3 3 3 3 3 3 3 1 ...
## $ productivity_effected : Factor w/ 3 levels "No","Yes","n/a": 3 3 3 2 3 3 3 3 3 2 ...
## $ percentage : Factor w/ 5 levels "1-25%","26-50%",..: 5 5 5 1 5 5 5 5 5 1 ...
## $ previous_employer : int 1 1 1 1 1 1 1 1 1 1 ...
## $ prevemp_mental_health_coverage : Factor w/ 5 levels "","I don't know",..: 3 5 3 4 2 3 4 4 2 4 ...
## $ prevemp_mental_health_options : Factor w/ 5 levels "","I was aware of some",..: 3 2 3 3 3 5 2 2 3 2 ...
## $ prevemp_mental_health_formally_discussed: Factor w/ 5 levels "","I don't know",..: 2 3 3 3 4 3 3 4 4 3 ...
## $ prevemp_mental_health_resources : Factor w/ 4 levels "","None did",..: 2 3 3 2 2 2 3 3 2 2 ...
## $ prevemp_anonymity_protected : Factor w/ 5 levels "","I don't know",..: 2 5 2 2 2 2 2 4 2 2 ...
## $ prevemp_mental_health_negative : Factor w/ 5 levels "","I don't know",..: 4 3 2 4 4 5 3 4 5 4 ...
## $ prevemp_physical_health_negative : Factor w/ 4 levels "","None of them",..: 2 2 3 3 3 3 2 3 4 3 ...
## $ prevemp_mental_health_coworker : Factor w/ 4 levels "","No, at none of my previous employers",..: 3 2 3 3 2 2 3 3 2 3 ...
## $ prevemp_mental_health_comfort_supervisor: Factor w/ 5 levels "","I don't know",..: 4 4 2 4 4 3 5 4 3 4 ...
## $ prevemp_mental_health_taken_seriously : Factor w/ 5 levels "","I don't know",..: 2 4 2 2 4 3 4 4 3 2 ...
## $ prevemp_coworker_negative_consequences : Factor w/ 4 levels "","None of them",..: 2 2 3 3 3 3 2 3 2 2 ...
## $ phsyical_issue_interview : Factor w/ 3 levels "Maybe","No","Yes": 1 1 3 3 1 3 3 2 1 3 ...
## $ why_physical : Factor w/ 1087 levels ""," Don't trust potential employers to not judge.",..: 1 664 907 1065 99 539 328 269 734 202 ...
## $ mental_health_interview : Factor w/ 3 levels "Maybe","No","Yes": 1 2 3 1 2 1 3 2 1 1 ...
## $ why_mental : Factor w/ 1082 levels ""," I don't want to poison the well... people are prejudiced and in denial about their own issues and vulnerabilit"| __truncated__,..: 1 950 848 1079 259 577 359 804 707 562 ...
## $ career_hurt : Factor w/ 5 levels "Maybe","No, I don't think it would",..: 1 2 1 4 4 4 4 1 1 1 ...
## $ viewed_negatively_by_coworkers : Factor w/ 5 levels "Maybe","No, I don't think they would",..: 2 2 1 1 1 1 2 1 5 2 ...
## $ share_with_family : Factor w/ 6 levels "Neutral","Not applicable to me (I do not have a mental illness)",..: 5 5 5 1 5 5 2 5 5 6 ...
## $ observed_poor_handling : Factor w/ 5 levels "Maybe/Not sure",..: 3 3 1 3 4 4 3 5 5 3 ...
## $ observations_lead_less_likely_to_reveal : Factor w/ 5 levels "","Maybe","N/A",..: 1 1 5 1 5 4 1 2 4 1 ...
## $ family_history : Factor w/ 3 levels "I don't know",..: 2 3 2 2 3 2 2 3 3 3 ...
## $ ever_had_mental_disorder : Factor w/ 3 levels "Maybe","No","Yes": 3 3 1 3 3 2 2 3 3 3 ...
## $ currently_have_mental_disorder : Factor w/ 3 levels "Maybe","No","Yes": 2 3 2 3 3 3 2 3 3 3 ...
## $ if_yes_what : Factor w/ 129 levels "","Addictive Disorder",..: 1 12 1 12 67 68 1 69 85 12 ...
## $ if_maybe_what : Factor w/ 100 levels "","Addictive Disorder",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ medical_prof_diagnosis : Factor w/ 2 levels "No","Yes": 2 2 1 2 2 1 1 2 2 2 ...
## $ what_conditions : Factor w/ 117 levels "","ADD (w/o Hyperactivity)",..: 5 11 1 11 5 1 1 54 70 11 ...
## $ sought_prof_treatment : int 0 1 1 1 1 1 0 1 1 1 ...
## $ treatment_affects_work : Factor w/ 5 levels "Never","Not applicable to me",..: 2 4 2 5 5 2 2 5 4 4 ...
## $ no_treatment_affects_work : Factor w/ 5 levels "Never","Not applicable to me",..: 2 5 2 5 5 3 2 3 3 3 ...
## $ age : int 39 29 38 43 43 42 30 37 44 30 ...
## $ gender : Factor w/ 72 levels ""," Female","AFAB",..: 30 64 31 64 15 30 28 54 15 30 ...
## $ country : Factor w/ 53 levels "Afghanistan",..: 50 51 50 50 51 50 51 51 51 51 ...
## $ state : Factor w/ 48 levels "","Alabama","Alaska",..: 1 13 1 1 13 1 41 45 5 17 ...
## $ country_work : Factor w/ 53 levels "Afghanistan",..: 50 51 50 50 51 50 51 51 51 51 ...
## $ state_work : Factor w/ 49 levels "","Alabama","Alaska",..: 1 14 1 1 14 1 42 46 5 18 ...
## $ work_position : Factor w/ 264 levels "Back-end Developer",..: 1 9 1 206 88 66 1 32 255 154 ...
## $ remote_work : Factor w/ 3 levels "Always","Never",..: 3 2 1 3 3 3 3 1 3 1 ...
# Uncomment to inspect the raw free-text gender answers:
# head(mh$gender)
# levels(mh$gender)
# summary(mh$gender)

# Standardise gender ----
# Collapse the free-text `gender` column into a two-level factor `sex`
# ("female" / "male"). Answers matching neither pattern become NA and the
# corresponding rows are dropped at the end of this section.

# Raw levels counted as female: anything containing "fe" (female, femme, ...),
# a lone "f", "fm", or "woman".
female <- grep("fe|^f$|fm|woman", levels(mh$gender),
               ignore.case = TRUE, perl = TRUE, value = TRUE)
# Raw levels counted as male. The word boundary must be written "\\b": inside
# an R string literal "\b" is a backspace character, so the previous
# '\bmale' / '\bman' alternatives could never match.
male <- grep("^m$|\\bmale|male |mail|\\bman|masculine|dude", levels(mh$gender),
             ignore.case = TRUE, perl = TRUE, value = TRUE)

# Build the recoded column as character first, so the result does not depend
# on "female"/"male" already being levels of the original factor.
mh$sex <- NA_character_
mh$sex[mh$gender %in% male] <- "male"
# Female is assigned last so that answers matching both patterns
# (e.g. "female " also contains "male ") end up as female.
mh$sex[mh$gender %in% female] <- "female"
mh$sex <- factor(mh$sex, levels = c("female", "male"))

# Sanity checks: how many respondents fall into each bucket.
summary(mh$sex %in% "female")
## Mode FALSE TRUE
## logical 1088 345
summary(mh$sex %in% "male")
## Mode FALSE TRUE
## logical 375 1058
summary(is.na(mh$sex))
## Mode FALSE TRUE
## logical 1403 30
table(mh$sex)
##
## female male
## 345 1058
# Omit the rows whose gender could not be classified (sex is NA).
mh <- mh %>% filter(!is.na(sex))
Frequency Distribution and Histogram of all factors
# Drop rows whose company size is "0" (the self-employed respondents,
# per the original note).
noofemployees <- filter(mh, num_employees != "0")
# Re-level company size so the categories run from smallest to largest.
noofemployees$num_employees <- factor(
  noofemployees$num_employees,
  levels = c(
    "1 to 5", "6 to 25", "26-100",
    "100-500", "500-1000", "More than 1000"
  )
)
# Bar chart of respondents per company size, with the count above each bar.
noemployees <- ggplot(noofemployees, aes(num_employees))
noemployees +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Number of Employees") +
  xlab("Number of Employees") +
  theme_bw()
# Keep only rows with an actual answer to the tech-company question
# (drops the "n/a" level).
techcomp <- filter(mh, tech_company != "n/a")
# Bar chart of the tech-company answers, with the count above each bar.
techcom <- ggplot(techcomp, aes(tech_company))
techcom +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Tech Company Or Not") +
  xlab("0 is No, 1 is Yes") +
  theme_bw()
# Keep only rows with an actual answer to the primary-role question
# (drops the "n/a" level).
primaryrole <- filter(mh, primary_role != "n/a")
# Bar chart of the primary-role answers, with the count above each bar.
prirole <- ggplot(primaryrole, aes(primary_role))
prirole +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Primary Role is Tech Or Not") +
  xlab("0 is No, 1 is Yes") +
  theme_bw()
# Bar chart of the "currently have a mental disorder" answers
# (Maybe / No / Yes), with the count above each bar.
CHMD <- ggplot(mh, aes(currently_have_mental_disorder))
CHMD +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Currently Have Mental Disorder Or Not") +
  xlab("") +
  theme_bw()
# Age ----
summary(mh$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 28.00 33.00 34.31 39.00 323.00
# Ages 3 and 323 are treated as incorrect entries and replaced with the
# median age (33, see the summary above).
mh$age[mh$age %in% c(3, 323)] <- 33
# Plot the distribution.
# binwidth = 1 gives one bar per year of age, so the per-age counts drawn by
# geom_text(stat = "count") sit on top of the bars they describe. With the
# default 30 bins each bar pooled several ages and the labels did not match.
ggplot(mh, aes(age)) +
  geom_histogram(binwidth = 1) +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Ages") +
  xlab("Age") +
  theme_bw()
# Bar chart of respondents per country of work, with the count above each
# bar; axis tick labels are rotated 90 degrees.
CW <- ggplot(mh, aes(country_work))
CW +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Country Worked") +
  xlab("Country Worked") +
  theme(axis.text = element_text(angle = 90, hjust = 1))
# Bar chart of the remote-work answers, with the count above each bar.
RW <- ggplot(mh, aes(remote_work))
RW +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Remote Work Or Not") +
  theme_bw()
# Group the employed respondents by company size and current mental-disorder
# status.
Company.size <- group_by(
  noofemployees,
  num_employees, currently_have_mental_disorder
)
# One panel per company size, each with its own y scale, showing how many
# respondents answered Maybe / No / Yes.
ggplot(Company.size, aes(currently_have_mental_disorder)) +
  geom_bar() +
  facet_wrap(~num_employees, scales = "free_y") +
  ggtitle("How many workers Current have mental disorder in different size company?") +
  xlab("Currently with mental disorder?") +
  theme_bw()
# Calculate counts and frequencies
# NOTE(review): plyr is presumably detached so that its summarise()/mutate()
# do not mask the dplyr versions used below - confirm against the setup code.
# detach() raises an error when the package is not attached (e.g. when this
# script is re-run in the same session), so only detach when it is present.
if ("package:plyr" %in% search()) {
  detach("package:plyr", unload = TRUE)
}
## Warning: 'plyr' namespace cannot be unloaded:
## namespace 'plyr' is imported by 'scales', 'ggplot2' so cannot be unloaded
library(dplyr)
# n    = respondents per (company size, mental-disorder status) pair
# freq = share of each status within its company size; summarise() drops the
#        last grouping variable, so sum(n) is taken per num_employees.
freq1 <- Company.size %>%
  group_by(num_employees, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
# Plot Frequencies: stacked bars, one per company size, filled by status.
pfreq1 <- ggplot(
  freq1,
  aes(x = num_employees, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  xlab("Company Size") +
  ylab("Frequency")
ggplotly(pfreq1)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Chi-squared test of independence: is there any relationship between
# company size and current mental-disorder status?
# H0: NO  - company size and mental-disorder status are independent
# H1: YES - company size and mental-disorder status are related
# The contingency table is built from the employed respondents only
# (noofemployees).
chisq.test(table(noofemployees$num_employees,noofemployees$currently_have_mental_disorder))
##
## Pearson's Chi-squared test
##
## data: table(noofemployees$num_employees, noofemployees$currently_have_mental_disorder)
## X-squared = 15.15, df = 10, p-value = 0.1267
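Based on the result of the chi-squared test (p-value = 0.1267 > 0.05), we fail to reject H0 at the 5% significance level: there is no significant evidence of a relationship between company size and mental disorder.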
# Group the respondents who answered the tech-company question by sex and
# current mental-disorder status.
Tech.company <- group_by(techcomp, sex, currently_have_mental_disorder)
# One panel per tech_company value (0 / 1), each with its own y scale,
# showing how many respondents answered Maybe / No / Yes.
ggplot(Tech.company, aes(currently_have_mental_disorder)) +
  geom_bar() +
  facet_wrap(~tech_company, scales = "free_y") +
  ggtitle("How many employees Current have mental disorder in tech company") +
  xlab("Currently with mental disorder?") +
  theme_bw()
# Counts (n) per (sex, mental-disorder status) pair and the share (freq) of
# each status within its sex.
freq2 <- Tech.company %>%
  group_by(sex, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
freq2
## # A tibble: 6 x 4
## # Groups: sex [2]
## sex currently_have_mental_disorder n freq
## <fct> <fct> <int> <dbl>
## 1 female Maybe 47 0.171
## 2 female No 82 0.298
## 3 female Yes 146 0.531
## 4 male Maybe 202 0.238
## 5 male No 359 0.423
## 6 male Yes 288 0.339
# Stacked bars, one per sex, filled by mental-disorder status.
pfreq2 <- ggplot(
  freq2,
  aes(x = sex, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  labs(x = "Gender", y = "Frequency")
ggplotly(pfreq2)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
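Among respondents who answered the tech-company question (tech and non-tech employers combined, since only "n/a" answers were excluded), a higher share of female workers (53.1%) than male workers (33.9%) report currently having a mental disorder.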
# Group the respondents who answered the primary-role question by sex and
# current mental-disorder status.
Tech.role <- group_by(primaryrole, sex, currently_have_mental_disorder)
# One panel per sex, each with its own y scale, showing how many respondents
# answered Maybe / No / Yes.
ggplot(Tech.role, aes(currently_have_mental_disorder)) +
  geom_bar() +
  facet_wrap(~sex, scales = "free_y") +
  ggtitle("How many tech workers current have mental disorder") +
  xlab("Currently with mental disorder?") +
  theme_bw()
# Counts (n) per (sex, mental-disorder status) pair and the share (freq) of
# each status within its sex.
freq3 <- Tech.role %>%
  group_by(sex, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
freq3
## # A tibble: 6 x 4
## # Groups: sex [2]
## sex currently_have_mental_disorder n freq
## <fct> <fct> <int> <dbl>
## 1 female Maybe 11 0.157
## 2 female No 23 0.329
## 3 female Yes 36 0.514
## 4 male Maybe 47 0.249
## 5 male No 78 0.413
## 6 male Yes 64 0.339
# Stacked bars, one per sex, filled by mental-disorder status.
pfreq3 <- ggplot(
  freq3,
  aes(x = sex, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  labs(x = "Gender", y = "Frequency")
ggplotly(pfreq3)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
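Among respondents who answered the primary-role question (tech and non-tech roles combined, since only "n/a" answers were excluded), a higher share of female workers (51.4%) than male workers (33.9%) report currently having a mental disorder.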
# Overwrite the age value 99 with the median age, 33.
mh[mh$age %in% 99, "age"] <- 33
# Group all respondents by age and current mental-disorder status.
Age.Re <- group_by(mh, age, currently_have_mental_disorder)
# Counts (n) per (age, status) pair and the share (freq) of each status
# within its age.
library(dplyr)
freq4 <- Age.Re %>%
  group_by(age, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
freq4
## # A tibble: 126 x 4
## # Groups: age [50]
## age currently_have_mental_disorder n freq
## <dbl> <fct> <int> <dbl>
## 1 15.0 No 1 1.00
## 2 17.0 No 1 1.00
## 3 19.0 Maybe 1 0.250
## 4 19.0 No 2 0.500
## 5 19.0 Yes 1 0.250
## 6 20.0 Maybe 2 0.333
## 7 20.0 No 3 0.500
## 8 20.0 Yes 1 0.167
## 9 21.0 Maybe 5 0.357
## 10 21.0 No 3 0.214
## # ... with 116 more rows
# Stacked bars, one per age, filled by mental-disorder status.
pfreq4 <- ggplot(
  freq4,
  aes(x = age, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  labs(x = "Age", y = "Frequency")
ggplotly(pfreq4)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Chi-squared test of independence: is there any relationship between
# age and current mental-disorder status?
# H0: NO  - age and mental-disorder status are independent
# H1: YES - age and mental-disorder status are related
# Every distinct age is its own row of the contingency table.
# NOTE(review): the warning below is presumably caused by the many ages with
# very few respondents - consider binning ages into groups before testing.
chisq.test(table(mh$age,mh$currently_have_mental_disorder))
## Warning in chisq.test(table(mh$age, mh$currently_have_mental_disorder)):
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(mh$age, mh$currently_have_mental_disorder)
## X-squared = 95.517, df = 98, p-value = 0.5522
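Based on the result of the chi-squared test (p-value = 0.5522 > 0.05), we fail to reject H0 at the 5% significance level: there is no significant evidence of a relationship between age and mental disorder.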